In [1]:
import pandas as pd
import math
import tabulate as tb
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import missingno as msn
import plotly.express as px
import seaborn as sns
import numpy as np
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV

df=pd.read_csv('water_potability.csv')
In [2]:
########  DATA VISUALIZATION #####
df
Out[2]:
ph Hardness Solids Chloramines Sulfate Conductivity Organic_carbon Trihalomethanes Turbidity Potability
0 NaN 204.890455 20791.318981 7.300212 368.516441 564.308654 10.379783 86.990970 2.963135 0
1 3.716080 129.422921 18630.057858 6.635246 NaN 592.885359 15.180013 56.329076 4.500656 0
2 8.099124 224.236259 19909.541732 9.275884 NaN 418.606213 16.868637 66.420093 3.055934 0
3 8.316766 214.373394 22018.417441 8.059332 356.886136 363.266516 18.436524 100.341674 4.628771 0
4 9.092223 181.101509 17978.986339 6.546600 310.135738 398.410813 11.558279 31.997993 4.075075 0
... ... ... ... ... ... ... ... ... ... ...
3271 4.668102 193.681735 47580.991603 7.166639 359.948574 526.424171 13.894419 66.687695 4.435821 1
3272 7.808856 193.553212 17329.802160 8.061362 NaN 392.449580 19.903225 NaN 2.798243 1
3273 9.419510 175.762646 33155.578218 7.350233 NaN 432.044783 11.039070 69.845400 3.298875 1
3274 5.126763 230.603758 11983.869376 6.303357 NaN 402.883113 11.168946 77.488213 4.708658 1
3275 7.874671 195.102299 17404.177061 7.509306 NaN 327.459760 16.140368 78.698446 2.309149 1

3276 rows × 10 columns

In [3]:
df.describe()
Out[3]:
ph Hardness Solids Chloramines Sulfate Conductivity Organic_carbon Trihalomethanes Turbidity Potability
count 2785.000000 3276.000000 3276.000000 3276.000000 2495.000000 3276.000000 3276.000000 3114.000000 3276.000000 3276.000000
mean 7.080795 196.369496 22014.092526 7.122277 333.775777 426.205111 14.284970 66.396293 3.966786 0.390110
std 1.594320 32.879761 8768.570828 1.583085 41.416840 80.824064 3.308162 16.175008 0.780382 0.487849
min 0.000000 47.432000 320.942611 0.352000 129.000000 181.483754 2.200000 0.738000 1.450000 0.000000
25% 6.093092 176.850538 15666.690297 6.127421 307.699498 365.734414 12.065801 55.844536 3.439711 0.000000
50% 7.036752 196.967627 20927.833607 7.130299 333.073546 421.884968 14.218338 66.622485 3.955028 0.000000
75% 8.062066 216.667456 27332.762127 8.114887 359.950170 481.792304 16.557652 77.337473 4.500320 1.000000
max 14.000000 323.124000 61227.196008 13.127000 481.030642 753.342620 28.300000 124.000000 6.739000 1.000000
In [4]:
d= pd.DataFrame(df['Potability'].value_counts())
fig = px.pie(d,values='Potability',names=['Not Potable','Potable'],hole=0.4,opacity=0.6,
            color_discrete_sequence=['#74C365','#51C4D3'],
             labels={'label':'Potability','Potability':'No. Of Samples'})
fig.show()
In [5]:
non_potable = df.query("Potability == 0")
potable = df.query("Potability == 1")
plt.figure(figsize = (15, 15))
for ax, col in enumerate(df.columns[:9]):
    plt.subplot(3, 3, ax + 1)
    plt.title(col)
    sns.kdeplot(x = non_potable[col], label = "Non Potable",color='#4F7942')
    sns.kdeplot(x = potable[col], label = "Potable",color='#51C4D3')
    plt.legend()
plt.tight_layout()
In [6]:
matrix = np.triu(df.corr())
plt.figure(figsize=(10,10))
sns.heatmap(df.corr(), annot=True, fmt= '.3f', mask=matrix,cmap='Blues')
Out[6]:
<AxesSubplot:>
In [7]:
plt.figure(figsize=(7, 10))
heatmap = sns.heatmap(df.corr()[['Potability']].sort_values(by='Potability', ascending=False),annot=True, cmap='GnBu_r')
plt.title('Correlation with Potability',pad=20, fontsize=16)
Out[7]:
Text(0.5, 1.0, 'Correlation with Potability')
In [8]:
##### MISSING VALUES #####
df.isnull().sum()
Out[8]:
ph                 491
Hardness             0
Solids               0
Chloramines          0
Sulfate            781
Conductivity         0
Organic_carbon       0
Trihalomethanes    162
Turbidity            0
Potability           0
dtype: int64
In [9]:
mis_colors = []

for col in df.columns:
    if df[col].isna().sum() != 0:
        mis_colors.append('#51C4D3')
    else:
        mis_colors.append('gray')
msn.bar(df, color=mis_colors)
plt.title('Missing values (before)', size=40, y=1.15)
Out[9]:
Text(0.5, 1.15, 'Missing values (before)')
In [10]:
from sklearn.impute import KNNImputer

imputer = KNNImputer(n_neighbors=10, weights="uniform")
l=imputer.fit_transform(df)
df=pd.DataFrame(l,columns=df.columns)


mis_colors_after = []

for col in df.columns:
    if df[col].isna().sum() != 0:
        mis_colors_after.append('#51C4D3')
    else:
        mis_colors_after.append('#74C365')
msn.bar(df, color=mis_colors_after)
plt.title('Missing values (after)', size=45, y=1.15)
Out[10]:
Text(0.5, 1.15, 'Missing values (after)')
In [11]:
i=1
plt.figure(figsize=(17,25))
for feature in df.columns.drop('Potability'):
    plt.subplot(6,3,i)
    sns.boxplot(y=df[feature])
    i+=1
In [ ]:
 
In [12]:
X = df.drop('Potability',axis=1)
y = df['Potability'].values

l=[]
names=[]
for col in X.columns:
    names.append(col)
    l.append(f'{math.floor(X[col].min())} to {math.ceil(X[col].max())}')

tab = pd.DataFrame(list(zip(names,l)),columns =['Name', 'Range'])
print(tb.tabulate(tab, headers='keys', tablefmt='pretty'))

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=101)
+---+-----------------+--------------+
|   |      Name       |    Range     |
+---+-----------------+--------------+
| 0 |       ph        |   0 to 14    |
| 1 |    Hardness     |  47 to 324   |
| 2 |     Solids      | 320 to 61228 |
| 3 |   Chloramines   |   0 to 14    |
| 4 |     Sulfate     |  129 to 482  |
| 5 |  Conductivity   |  181 to 754  |
| 6 | Organic_carbon  |   2 to 29    |
| 7 | Trihalomethanes |   0 to 124   |
| 8 |    Turbidity    |    1 to 7    |
+---+-----------------+--------------+
In [13]:
#As there are variations between the ranges of the column values, scaling is necessary.
scaler=StandardScaler()
scaler.fit(X_train)
X_train=pd.DataFrame(scaler.transform(X_train))
X_test=pd.DataFrame(scaler.transform(X_test))
In [14]:
from sklearn.metrics import classification_report

param_grid_svc = {'C': [0.01, 0.05, 0.1, 0.5, 1.0, 10.0],'kernel': ['linear','rbf']}
# Create an instance of GridSearch Cross-validation estimator
model=SVC()
gsSVC = GridSearchCV(estimator= model,
                     param_grid = param_grid_svc,
                     scoring='accuracy',
                     cv=10,
                     refit=True)
# Train the SVM classifier
gsSVC.fit(X_train, y_train)

# Print the model parameters of the best model
print(gsSVC.best_params_)
# Print the model score on the test data using GridSearchCV score method
print('Test accuracy: %.3f' % gsSVC.score(X_test, y_test))

print(classification_report(y_test,gsSVC.predict(X_test)))
{'C': 1.0, 'kernel': 'rbf'}
Test accuracy: 0.674
              precision    recall  f1-score   support

         0.0       0.67      0.93      0.78       603
         1.0       0.71      0.27      0.39       380

    accuracy                           0.67       983
   macro avg       0.69      0.60      0.58       983
weighted avg       0.68      0.67      0.63       983

In [15]:
#then calculate the mean and standard deviation of the scores
from sklearn.model_selection import cross_val_score
scores = cross_val_score(SVC(C=1.0,kernel='rbf'), X, y, cv=5,scoring='accuracy')
print(scores)
print("%0.2f accuracy with a standard deviation of %0.3f" % (scores.mean(), scores.std()))
[0.6097561  0.61068702 0.61068702 0.60916031 0.60916031]
0.61 accuracy with a standard deviation of 0.001
In [16]:
from sklearn.neural_network import MLPClassifier
mlp_gs = MLPClassifier(max_iter=1000)
parameter_space = {
    'hidden_layer_sizes': [(10,30,10),(20,)],
    'activation': ['tanh', 'relu'],
    'alpha': [0.0001, 0.05],
    'learning_rate': ['constant','adaptive'],
}
mlp = GridSearchCV(estimator=mlp_gs,
                   param_grid= parameter_space,
                   n_jobs=-1,
                   cv=5,
                   refit=True)
# Train the SVM classifier
mlp.fit(X_train, y_train)

print(mlp.best_params_)
# Print the model score on the test data using GridSearchCV score method
print('Test accuracy: %.3f' % mlp.score(X_test, y_test))

predict_train = mlp.predict(X_train)
predict_test = mlp.predict(X_test)
print(classification_report(y_test,mlp.predict(X_test)))
{'activation': 'relu', 'alpha': 0.05, 'hidden_layer_sizes': (20,), 'learning_rate': 'constant'}
Test accuracy: 0.667
              precision    recall  f1-score   support

         0.0       0.69      0.83      0.75       603
         1.0       0.60      0.42      0.49       380

    accuracy                           0.67       983
   macro avg       0.65      0.62      0.62       983
weighted avg       0.66      0.67      0.65       983

In [17]:
scores_mlp = cross_val_score(MLPClassifier(max_iter=1000,activation= 'relu', alpha= 0.05, hidden_layer_sizes= (20,), learning_rate= 'constant'), X, y, cv=5,scoring='accuracy')
print(scores_mlp)
print("%0.2f accuracy with a standard deviation of %0.2f" % (scores_mlp.mean(), scores_mlp.std()))
[0.60365854 0.61068702 0.55877863 0.55877863 0.44427481]
0.56 accuracy with a standard deviation of 0.06
In [18]:
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier

param_grid_forest = {'max_features':[2, 3, 4, 6,8],'max_depth':[2, 3, 4,10]}
# Create an instance of GridSearch Cross-validation estimator
forest=RandomForestClassifier(n_estimators= 100,criterion='gini', random_state=0)
gsForest= GridSearchCV(estimator= forest,
                     param_grid = param_grid_forest,
                     scoring='accuracy',
                     cv=10,
                     refit=True)
#scores = cross_val_score(gsSVC, X, y, cv=5,scoring='accuracy')
#print(scores)
# Train the SVM classifier
gsForest.fit(X_train, y_train)

# Print the model parameters of the best model
print(gsForest.best_params_)
# Print the model score on the test data using GridSearchCV score method
print('Test accuracy: %.3f' % gsForest.score(X_test, y_test))

print(classification_report(y_test,gsForest.predict(X_test)))
{'max_depth': 10, 'max_features': 6}
Test accuracy: 0.678
              precision    recall  f1-score   support

         0.0       0.67      0.92      0.78       603
         1.0       0.70      0.29      0.41       380

    accuracy                           0.68       983
   macro avg       0.69      0.61      0.59       983
weighted avg       0.68      0.68      0.64       983

In [19]:
scores_forest = cross_val_score(RandomForestClassifier(n_estimators= 100,criterion='gini', random_state=0,max_depth= 10, max_features= 6), X, y, cv=5,scoring='accuracy')
print(scores_forest)
print("%0.2f accuracy with a standard deviation of %0.2f" % (scores_forest.mean(), scores_forest.std()))
[0.6097561  0.64885496 0.65648855 0.61526718 0.66564885]
0.64 accuracy with a standard deviation of 0.02